Load data: 224 subjects with 791 goals are included in the following analysis.

# Load the goal-rating data (long format; later reshaped with spread(variable, rating)).
# Fixed: spell out FALSE -- `F` is an ordinary binding that can be reassigned.
goalRating_long_R <- read.csv("./inputs/goalRating_long_R.csv", stringsAsFactors = FALSE)

# Load the individual-difference measures (one row per subject, used for subject counts).
indivDiffDf <- read.csv("./inputs/indivDiffDf.csv", stringsAsFactors = FALSE)

# Load the per-subject goal summary (wide format: total_goal, listNum, ...).
goalDf_sum_wide <- read.csv("./inputs/goalDf_wide.csv", stringsAsFactors = FALSE)

Data Screening for goal representation assessment

Missing data

Check the amount of missing data per variable; the top 5 variables are shown below. Missing data are rare for all variables.

# Check the number of missing (NA) ratings per variable; report the top 5.
# Fixed: totalGoal was computed as nrow(...)/35, but the data contain 39
# rating variables per goal (the descriptive table below lists 39 variables,
# and 791 goals * 39 = nrow), so the denominator was ~881 instead of the
# stated 791 goals. Derive the number of variables from the data instead.
totalGoal <- nrow(goalRating_long_R) / length(unique(goalRating_long_R$variable))

goalRating_long_R %>%
  filter(is.na(rating)) %>%         # keep rows with a missing rating
  tabyl(variable) %>%               # tally NAs per variable (janitor)
  mutate(percent = n / totalGoal) %>% # proportion of all goals affected
  arrange(desc(percent)) %>%
  head(5)
##         variable n     percent
##       commitment 6 0.006807352
##       importance 5 0.005672793
##  instrumentality 5 0.005672793
##           regret 5 0.005672793
##         conflict 4 0.004538235

The “I’m not sure” response

The “construal_level”, “approach_avoidance” and “attainment_maintenance” questions have an option for “I’m not sure” because they ask subjects to categorize their goals.

around 2% of the goals had “I’m not sure” as the response.

# Check the number of "I'm not sure" responses (coded 99) per variable;
# only the three goal-categorization questions offer this option.
goalRating_long_R %>%
  filter(rating == 99) %>%
  tabyl(variable) %>%
  mutate(percent = n/totalGoal) %>%
  arrange(desc(percent))
##                  variable  n    percent
##           construal_level 23 0.02609485
##  attainment_maintenance_R 16 0.01815294
##      approach_avoidance_R 15 0.01701838

The “not specified” response

The temporal_duration, frequency and end_state_specificity questions have an option for “not specified” because they ask about features that may not be applicable to all goals.

The end state specificity is not applicable to around 9% of the goals

# Check the number of "not specified" responses (coded 999) per variable;
# this option exists for features that may not apply to every goal.
goalRating_long_R %>%
  filter(rating == 999) %>%
  tabyl(variable) %>%
  mutate(percent = n/totalGoal) %>%
  arrange(desc(percent))
##                 variable  n    percent
##  end_state_specificity_R 82 0.09303381
##        temporal_duration 43 0.04878602
##              frequency_R 23 0.02609485

Transform all special cases to NAs

All “I’m not sure” and “not specified” responses will be treated as missing data.

# Recode the special response codes as missing values:
# 99 = "I'm not sure", 999 = "not specified".
# (%in% returns FALSE for NA, so already-missing ratings are left untouched.)
goalRating_long_R$rating[goalRating_long_R$rating %in% c(99, 999)] <- NA

The number of claimed goals

Descriptive statistics on the number of goals subjects claimed to have prior to listing them (in the SONA study, the median number of claimed goals is 4).

# Descriptive statistics for the number of claimed goals per subject
# (describe() -- output format matches psych::describe).
describe(goalDf_sum_wide$total_goal)
##    vars   n mean    sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 224 4.38 13.69      3    3.03 1.48   1 200   199 13.22   183.97 0.91

Visualize the number of claimed goals per subject after excluding the extreme value (> 50) (we have 1 claimed 50, 1 claimed 200)

# Histogram of the number of claimed goals per subject, after dropping
# the extreme values (>= 50 claimed goals).
breaks <- 1:20
goalDf_sum_wide %>%
  filter(total_goal < 50) %>%
  ggplot(aes(x = total_goal)) +
  geom_histogram(binwidth = 1, fill = "orange", colour = "black") +
  scale_x_continuous(breaks = breaks,
                     labels = scales::comma(breaks, accuracy = 1)) +
  labs(x = "Number of claimed goals", y = "# of participants") +
  theme_classic(base_size = 18)

The percentage of subjects who claimed having more than 5 goals: 6.25%

# Get the total number of subjects.
totalSub <- nrow(indivDiffDf)

# Proportion of subjects who claimed having more than 5 goals.
length(goalDf_sum_wide$total_goal[goalDf_sum_wide$total_goal>5])/totalSub
## [1] 0.0625

Descriptive statistics on the number of goals participants actually listed (in the SONA study, the mean is 4.13).

# Descriptive statistics for the number of goals actually listed (1-5).
describe(goalDf_sum_wide$listNum)
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 224 3.53 1.37      3    3.62 1.48   1   5     4 -0.26    -1.35 0.09
# Histogram of the number of goals each participant actually listed.
breaks <- 1:5
goalDf_sum_wide %>%
  ggplot(aes(x = listNum)) +
  geom_histogram(binwidth = 1, fill = "orange", colour = "black") +
  scale_x_continuous(breaks = seq(1, 5, by = 1),
                     labels = scales::comma(breaks, accuracy = 1)) +
  labs(x = "Number of listed goals", y = "# of participants") +
  theme_classic(base_size = 18)

number of people who listed 1 goal: 15 (SONA study: 1)

# Number of participants who listed exactly one goal.
length(goalDf_sum_wide$listNum[goalDf_sum_wide$listNum == 1])
## [1] 15

Descriptive statistics on the differences between the number of claimed goals and listed goals (after excluding the 2 extreme cases).

# diffNum = claimed goals minus listed goals (negative values mean the
# participant listed more goals than they initially claimed).
goalDf_sum_wide$diffNum <- goalDf_sum_wide$total_goal - goalDf_sum_wide$listNum

# Drop the two extreme cases (>= 50 claimed goals) before describing.
goalDf_sum_wide_clean <- subset(goalDf_sum_wide, total_goal < 50)

describe(goalDf_sum_wide_clean$diffNum)
##    vars   n  mean sd median trimmed mad min max range skew kurtosis   se
## X1    1 222 -0.23  2      0   -0.25   0  -4  15    19 4.35    31.11 0.13
# Histogram of diffNum (claimed - listed), extreme cases excluded.
breaks <- -4:15
goalDf_sum_wide_clean %>%
  ggplot(aes(x = diffNum)) +
  geom_histogram(binwidth = 1, fill = "orange", colour = "black") +
  scale_x_continuous(breaks = breaks,
                     labels = scales::comma(breaks, accuracy = 1)) +
  labs(x = "Number of claimed goals - listed goals", y = "# of participants") +
  theme_classic(base_size = 18)

percentage of people who listed more goals than they claimed: 22.3%

# Percentage of subjects who listed more goals than they claimed (diffNum < 0).
length(goalDf_sum_wide$diffNum[goalDf_sum_wide$diffNum <0])/totalSub *100
## [1] 22.32143

percentage of people who listed fewer goals than they claimed: 7.6%

# Percentage of subjects who listed fewer goals than they claimed (diffNum > 0).
length(goalDf_sum_wide$diffNum[goalDf_sum_wide$diffNum >0])/totalSub *100
## [1] 7.589286

Compared to the SONA study, more people listed more goals than they claimed, which may indicate a priming effect of the goal-listing task.

Goal Representation Ratings

Descriptive stats

# Descriptive stats for each rating variable, ordered by skewness
# (skew()/kurtosi() are psych helpers), rendered as a styled HTML table.
goalRating_long_R %>%
  dplyr::select(variable, rating) %>%
  group_by(variable) %>%
  summarize(mean = mean(rating, na.rm = TRUE),
            sd = sd(rating, na.rm = TRUE), 
            n = n(),
            min = min(rating, na.rm = TRUE),
            max = max(rating, na.rm = TRUE),
            skew = skew(rating, na.rm = T), 
            kurtosi = kurtosi(rating, na.rm = T)
            ) %>%
  arrange(skew) %>% # most negatively skewed variables first
  kable(format = "html", escape = F) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F,position = "center")
## `summarise()` ungrouping output (override with `.groups` argument)
variable mean sd n min max skew kurtosi
specificity 5.703046 1.4490556 791 1 7 -1.3968834 1.7409083
ideal_motivation 5.762025 1.3421417 791 1 7 -1.3019606 1.7582688
clarity 5.797468 1.1922835 791 1 7 -1.2591632 1.9628696
identified_motivation 6.029077 1.0805128 791 1 7 -1.2361306 1.7466351
control 5.951899 1.1851276 791 1 7 -1.2304266 1.5682491
importance 6.019084 1.1268092 791 1 7 -1.2272333 1.6847051
initial_time_R 6.517067 1.5925140 791 1 8 -1.2246051 1.1945486
basic_needs 5.433460 1.6017743 791 1 7 -1.1377953 0.7998718
attractiveness_achievement 5.937975 1.0536681 791 1 7 -1.1165131 1.5995647
social_desirability 5.936709 1.1069749 791 1 7 -1.0229599 0.9731957
commitment 5.938853 1.1501248 791 1 7 -1.0061179 0.6538724
commonality 5.406329 1.5405661 791 1 7 -0.9758224 0.4364142
attainability 8.474619 2.0460059 791 1 11 -0.9712587 0.8349664
instrumentality 5.367685 1.5585144 791 1 7 -0.9293689 0.2879043
attractiveness_progress 5.655260 1.1130460 791 1 7 -0.9026328 1.0765550
regret 5.368957 1.5700227 791 1 7 -0.8978937 0.1990149
measurability 5.624842 1.4200134 791 1 7 -0.8905870 0.1867125
meaningfulness 5.269377 1.5353726 791 1 7 -0.7894562 0.1375735
temporal_duration 3.076305 0.9396327 791 1 4 -0.7133993 -0.4740105
construal_level 5.002614 1.8016843 791 1 7 -0.6717568 -0.5209142
visibility 4.977128 1.7797233 791 1 7 -0.6661160 -0.4917870
approach_avoidance_R 5.047742 2.3403089 791 1 7 -0.6499189 -1.2634578
difficulty 5.363291 1.3286048 791 1 7 -0.6070920 -0.0754915
affordance 5.150824 1.4404649 791 1 7 -0.6002490 -0.1305322
external_importance 4.657795 1.8875778 791 1 7 -0.5408762 -0.7979445
effort 5.080051 1.5014653 791 1 7 -0.5213915 -0.4184612
urgency 4.989848 1.5097812 791 1 7 -0.4850394 -0.3154366
introjected_motivation 4.348101 2.0268390 791 1 7 -0.4139940 -1.1128018
intrinsic_motivation 4.465057 2.0195574 791 1 7 -0.4041309 -1.1038115
connectedness 4.517154 1.8848932 791 1 7 -0.3863797 -0.9558996
external_motivation 4.130545 2.0977247 791 1 7 -0.2854198 -1.3223306
advancement 6.314322 2.9241790 791 1 11 -0.1142650 -1.1289960
ought_motivation 3.898734 2.1537537 791 1 7 -0.0842966 -1.4430863
procrastination 4.048223 1.8357482 791 1 7 -0.0820074 -1.1796906
attainment_maintenance_R 3.994832 2.4741822 791 1 7 0.0632405 -1.6729519
frequency_R 1.474576 0.4996791 791 1 2 0.1016276 -1.9922642
end_state_specificity_R 1.931915 0.8940554 791 1 3 0.1333064 -1.7376529
conflict 3.510801 2.0373916 791 1 7 0.1759990 -1.3270935
failure 1.710127 0.8651974 791 1 3 0.5938757 -1.4052165
# order based on their skewness 
#kable(varDf[order(varDf$skew),])

The trends shown in these histograms are very similar to those in the SONA study.

# One histogram of raw ratings per dimension, arranged in a 7-row facet grid
# (default ggplot binning, as in the original).
goalRating_long_R %>%
  ggplot(aes(x = rating)) +
  geom_histogram(alpha = .6, fill = "orange", colour = "black") +
  facet_wrap(~variable, nrow = 7)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

correlational matrix across all variables

“pairwise.complete.obs” is used for generating the correlational matrix. The correlations make sense.

# Transform the long format to wide format (one column per rating variable).
# NOTE: spread() orders the new columns alphabetically -- the
# advancement:visibility range selection below relies on that ordering.
goalDf_wide <- goalRating_long_R %>% spread (variable, rating)

# Generate a correlational matrix using pairwise-complete observations.
corrM_all <- goalDf_wide %>% 
  dplyr :: select(advancement:visibility) %>% 
  cor(use = "pairwise.complete.obs")

# Visualization: upper-triangle correlogram with coefficients overlaid.
corrplot(corrM_all, method = "circle",number.cex = .7, order = "AOE", addCoef.col = "black",type = "upper",col= colorRampPalette(c("midnightblue","white", "orange"))(200))

### Variance Partition

Only the 31 variables for goal representation are included. Only around 8.6% of the variance is on the between subject level.

# Subset the long dataset to the 31 goal-representation variables by
# removing the 8 goal-striving variables.
goal_striving <- c("commitment", "urgency", "effort", "advancement", "initial_time_R", "regret", "procrastination", "failure")
goalDf_R_long <- goalRating_long_R[!goalRating_long_R$variable %in% goal_striving,]

# Fit a multilevel model with a random intercept per subject (MTurkCode).
mlm <-lmer(rating ~ variable + (1|MTurkCode), data = goalDf_R_long)

# Variance partition: each component's share of total variance
# (the subject-level share is the between-subject ICC).
VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(icc=vcov/sum(vcov)) %>%
  dplyr :: select(grp, icc)
## # A tibble: 2 x 2
##   grp          icc
##   <chr>      <dbl>
## 1 MTurkCode 0.0862
## 2 Residual  0.914
# Keep the same variance proportions as a one-column data frame.
# NOTE(review): `Raw` does not appear to be used later in this view --
# confirm it is needed elsewhere before removing.
Raw <- VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(Raw=vcov/sum(vcov)) %>%
  dplyr :: select(Raw)

Data transformation

26 variables are included. Ordinal variables are not included: “temporal_duration”, “end_state_specificity” and “frequency”; approach_avoidance_R & attainment_maintenance_R are also dropped because these 2 variables are more relevant to the phrasing/content of a goal than to the perception of a goal. This step is consistent with the SONA study.

# Exclude the 8 variables related to goal-striving progress.
goalDf_R_wide <- goalDf_wide[,!names(goalDf_wide) %in% goal_striving]

# Exclude 5 more variables (ordinal / goal-content items) plus other
# columns with irrelevant data.
goal_exclude <- c("temporal_duration", "end_state_specificity_R", "frequency_R", "attainment_maintenance_R", "approach_avoidance_R")
goalDf_EFA <- goalDf_R_wide[,!names(goalDf_R_wide) %in% goal_exclude]
# Keep only the rating columns (alphabetical affordance:visibility range).
goalDf_EFA <- subset(goalDf_EFA, select = affordance : visibility)

# Generate a correlational matrix for the EFA (pairwise-complete obs).
corrM_raw <- cor(goalDf_EFA, use = "pairwise")

evaluate the number of factors

# Use the Very Simple Structure criterion (psych::nfactors) to evaluate
# how many factors to extract from the raw correlation matrix.
res_vss <- psych :: nfactors(corrM_raw, n = 10, rotate = "promax", diagonal = FALSE, fm = "minres", 
n.obs=791,title="Very Simple Structure",use="pairwise",cor="cor")

# Select the useful fit indices and organize them into a table.
# Fixed: as.tibble() is deprecated (tibble >= 2.0.0) and cbind-ing an
# unnamed matrix triggered .name_repair warnings; build the tibble directly.
tibble(factor = 1:10, map = res_vss$map) %>%
  cbind(., res_vss$vss.stats) %>%
  select(factor, map, fit, complex, eChisq, SRMR, eCRMS, eBIC, eRMS) %>%
  kable(format = "html", escape = F, caption = "VSS output after dropping 2 variables") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F,position = "center",fixed_thead = T)
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
VSS output after dropping 2 variables
factor map fit complex eChisq SRMR eCRMS eBIC eRMS
1 0.0301970 0.5610515 1.000000 8679.43583 0.1299274 0.1354587 6684.1197 0.1299274
2 0.0146179 0.6580092 1.251094 2573.42431 0.0707474 0.0770508 744.9407 0.0707474
3 0.0132862 0.7149469 1.414591 1311.03084 0.0504965 0.0575749 -357.2937 0.0504965
4 0.0125204 0.7277976 1.574012 677.33499 0.0362958 0.0434296 -837.5036 0.0362958
5 0.0139541 0.7352400 1.731892 449.17540 0.0295572 0.0372158 -918.8507 0.0295572
6 0.0162217 0.7285363 1.891997 325.64092 0.0251666 0.0334470 -902.2459 0.0251666
7 0.0186254 0.7108767 1.969130 224.28020 0.0208858 0.0294016 -870.1407 0.0208858
8 0.0222770 0.6836729 2.094972 162.64913 0.0177861 0.0266280 -804.9791 0.0177861
9 0.0252166 0.6749553 2.050097 117.82758 0.0151384 0.0242169 -729.6813 0.0151384
10 0.0296479 0.6597003 1.803383 89.65666 0.0132052 0.0226982 -644.4061 0.0132052
# Use the Scree plot to identify the number of factors have Eigenvalues >1 and the output from the Parallel analysis

# Scree plot plus parallel analysis (presumably the nFactors package):
# compare observed eigenvalues against the 95th-percentile eigenvalues
# from 100 random datasets of the same size.
ev <- eigen(corrM_raw)
ap <- parallel(subject=nrow(goalDf_EFA),var=ncol(goalDf_EFA),
  rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)

### Extract factors

Extract number of factors based on the suggestions above. Because we expect factors to be correlated with each other, we use “promax” rotation.

# Extract candidate factor solutions (minres extraction, promax rotation --
# factors are expected to correlate).
# Fixed: fa_raw_4 previously requested nfactors = 5 (copy-paste error).
# Fixed: n.obs was 854 (apparently carried over from the SONA study);
# this sample has 791 goals, matching n.obs = 791 used for nfactors() above.

# extract 4 factors
fa_raw_4 <- fa(r = corrM_raw, nfactors = 4, n.obs = 791, rotate = "promax", SMC = FALSE, fm = "minres")

# extract 5 factors
fa_raw_5 <- fa(r = corrM_raw, nfactors = 5, n.obs = 791, rotate = "promax", SMC = FALSE, fm = "minres")

# extract 6 factors
fa_raw_6 <- fa(r = corrM_raw, nfactors = 6, n.obs = 791, rotate = "promax", SMC = FALSE, fm = "minres")

Compare loadings for each model

4 factors

# Loading diagram for the 4-factor solution.
fa.diagram(fa_raw_4)

5 factors

# Loading diagram for the 5-factor solution.
fa.diagram(fa_raw_5)

6 factors

# Loading diagram for the 6-factor solution.
fa.diagram(fa_raw_6)

5 factors

Compared to the 5 factors yielded by the SONA study, the factor “measurability” is combined with “attainability”, and the factor “ideal” is new. It is composed of the items “ideal_motivation” (previously in factor “importance”), “control” (previously in “measurability”), and “meaningfulness” (previously in “importance”).

# Reshape the sorted 5-factor loading matrix into long format for plotting,
# relabelling the factors with their interpreted names.
loadings <- fa.sort(fa_raw_5)$loadings
loadings <- as.data.frame(unclass(loadings))
colnames(loadings) <- c("ought", "importance", "attainability", "commonality", "ideal")
loadings$Variables <- rownames(loadings)
loadings.m <- loadings %>% gather(-Variables, key = "Factor", value = "Loading")
colOrder <- c("ought", "importance", "attainability", "commonality", "ideal")
rowOrder <- rev(rownames(loadings))
# Fixed: "leve" relied on partial argument matching; spell out "levels".
loadings.m <- arrange(mutate(loadings.m, Variables = factor(Variables, levels = rowOrder)), Variables)
loadings.m <- arrange(mutate(loadings.m, Factor = factor(Factor, levels = colOrder)), Factor)

# Bar chart of absolute loadings, one facet per factor, flipped so variable
# names read horizontally; bars coloured by signed loading
# (orange = positive, midnight blue = negative).
ggplot(loadings.m, aes(Variables, abs(Loading), fill = Loading)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Factor, nrow = 1) +
  coord_flip() +
  scale_fill_gradient2(name = "Loading",
                       low = "midnightblue", mid = "white", high = "orange",
                       midpoint = 0, guide = "colourbar") +
  ylab("Loading Strength") +
  ggtitle("Loadings for 5 factors") +
  theme_bw(base_size = 10)

The 5 factor loadings from the SONA study:

SONA 5-factor

interfactor correlation

# Interfactor correlations (Phi from the promax rotation) for the 5-factor
# solution, relabelled with the interpreted factor names.
# Fixed: as.tibble() is deprecated (tibble >= 2.0.0); use as_tibble().
fa_raw_5$Phi %>% 
  as_tibble() %>% 
  dplyr::rename(ought = MR1, importance = MR2, attainability = MR3, commonality = MR4, ideal = MR5) %>%
  round(.,2) %>%
  remove_rownames() %>%
  mutate(factor = colnames(.)) %>%
  select(factor, everything()) %>%
  kable(format = "html", escape = F, caption = "Interfactor Correlation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F,position = "center",fixed_thead = T)
Interfactor Correlation
factor ought importance attainability commonality ideal
ought 1.00 0.24 0.10 0.13 0.25
importance 0.24 1.00 0.35 0.37 0.25
attainability 0.10 0.35 1.00 0.04 0.23
commonality 0.13 0.37 0.04 1.00 -0.06
ideal 0.25 0.25 0.23 -0.06 1.00

6 factors

factor loadings

Compared to the 6 factors yielded by the SONA study, the factor “instrumentality” is replaced by the factor “ideal”.

# Reshape the sorted 6-factor loading matrix into long format for plotting,
# relabelling the factors with their interpreted names.
loadings <- fa.sort(fa_raw_6)$loadings
loadings <- as.data.frame(unclass(loadings))
colnames(loadings) <- c("ought", "importance", "measurability", "commonality", "ideal", "attainability")
loadings$Variables <- rownames(loadings)
loadings.m <- loadings %>% gather(-Variables, key = "Factor", value = "Loading")
colOrder <- c("ought", "importance", "measurability", "commonality", "ideal", "attainability")
rowOrder <- rev(rownames(loadings))
# Fixed: "leve" relied on partial argument matching; spell out "levels".
loadings.m <- arrange(mutate(loadings.m, Variables = factor(Variables, levels = rowOrder)), Variables)
loadings.m <- arrange(mutate(loadings.m, Factor = factor(Factor, levels = colOrder)), Factor)

# Bar chart of absolute loadings for the 6-factor solution, one facet per
# factor, flipped axes; bars coloured by signed loading
# (orange = positive, midnight blue = negative).
ggplot(loadings.m, aes(Variables, abs(Loading), fill = Loading)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Factor, nrow = 1) +
  coord_flip() +
  scale_fill_gradient2(name = "Loading",
                       low = "midnightblue", mid = "white", high = "orange",
                       midpoint = 0, guide = "colourbar") +
  ylab("Loading Strength") +
  ggtitle("Loadings for 6 factors") +
  theme_bw(base_size = 10)

The 6 factor loadings from the SONA study:

SONA 6-factor

interfactor correlation

# Interfactor correlations for the 6-factor solution, relabelled with the
# interpreted factor names (note the MR-to-name mapping differs from the
# 5-factor model).
# Fixed: as.tibble() is deprecated (tibble >= 2.0.0); use as_tibble().
fa_raw_6$Phi %>% 
  as_tibble() %>% 
  dplyr::rename(ought = MR1, importance = MR6, measurability = MR3, commonality = MR4, ideal = MR5, attainability = MR2) %>%
  round(.,2) %>%
  remove_rownames() %>%
  mutate(factor = colnames(.)) %>%
  select(factor, everything()) %>%
  kable(format = "html", escape = F, caption = "Interfactor Correlation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F,position = "center",fixed_thead = T)
Interfactor Correlation
factor ought importance measurability commonality ideal attainability
ought 1.00 0.32 -0.05 0.16 0.12 0.23
importance 0.32 1.00 0.30 0.37 0.22 0.29
measurability -0.05 0.30 1.00 0.18 0.20 0.20
commonality 0.16 0.37 0.18 1.00 -0.07 -0.04
ideal 0.12 0.22 0.20 -0.07 1.00 0.06
attainability 0.23 0.29 0.20 -0.04 0.06 1.00

Compare model fit & complexity

# Collect fit and complexity indices for the 5- and 6-factor models into a
# single comparison data frame, then print it.
fa_fitDf <- data.frame(
  factors    = c(5, 6),
  chi        = c(fa_raw_5$chi, fa_raw_6$chi),
  BIC        = c(fa_raw_5$BIC, fa_raw_6$BIC),
  fit        = c(fa_raw_5$fit, fa_raw_6$fit),
  RMSEA      = c(fa_raw_5$RMSEA[1], fa_raw_6$RMSEA[1]),
  cumVar     = c(max(fa_raw_5$Vaccounted[3, ]), max(fa_raw_6$Vaccounted[3, ])),
  complexity = c(mean(fa_raw_5$complexity), mean(fa_raw_6$complexity))
)

fa_fitDf
##   factors      chi       BIC       fit      RMSEA    cumVar complexity
## 1       5 484.9504 -680.9335 0.8555492 0.05331112 0.4384451   1.731892
## 2       6 351.5769 -669.4881 0.8679743 0.04970919 0.4579989   1.891997